package cqut.wys.crawler;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomNode;
import com.gargoylesoftware.htmlunit.html.DomNodeList;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.w3c.dom.Node;

import java.io.IOException;

/**
 * Command-line example that loads one page of www.16sucai.com with HtmlUnit and
 * prints the link target of every anchor matched by a fixed CSS selector.
 *
 * Created by wuyoushan on 2017/11/20.
 */
public class HtmlUnitCrawler {

    /** Scheme and host that is prepended to every extracted {@code href} value. */
    private static final String SITE_ROOT = "http://www.16sucai.com";

    /** Page that is fetched and scanned for links. */
    private static final String PAGE_URL = SITE_ROOT + "/tupian/gqfj/3.html";

    /** CSS selector for the anchors whose {@code href} attributes are printed. */
    private static final String LINK_SELECTOR = ".vector_listbox_pubu > ul > li > a";

    /**
     * Program entry point: runs the crawl and prints the stack trace if it fails
     * with an {@link IOException}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            get16sucaiByHtmlUnit();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Crawls www.16sucai.com using the HtmlUnit library.
     *
     * <p>Fetches {@link #PAGE_URL}, selects the anchors matching
     * {@link #LINK_SELECTOR} and writes {@link #SITE_ROOT} followed by each
     * anchor's {@code href} value to standard output, one per line. Anchors
     * without an {@code href} attribute are skipped.
     *
     * @throws IOException if fetching the page fails with an I/O error
     */
    private static void get16sucaiByHtmlUnit() throws IOException {
        // try-with-resources guarantees the client is closed even when loading
        // or parsing the page throws.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            HtmlPage htmlPage = webClient.getPage(PAGE_URL);
            DomNodeList<DomNode> anchors = htmlPage.querySelectorAll(LINK_SELECTOR);
            for (DomNode anchor : anchors) {
                Node href = anchor.getAttributes().getNamedItem("href");
                if (href == null) {
                    // getNamedItem returns null when the attribute is absent;
                    // there is nothing to print for such an anchor.
                    continue;
                }
                System.out.println(SITE_ROOT + href.getNodeValue());
            }
        }
    }
}
